Tutorial based in Real Python Post


In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load files from Label datasets
filepath_dict = {'yelp':   'sentiment-labelled-sentences/yelp_labelled.txt',
                 'amazon': 'sentiment-labelled-sentences/amazon_cells_labelled.txt',
                 'imdb':   'sentiment-labelled-sentences/imdb_labelled.txt'}

# Concatenate the datasets in the same dataframe
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)

# Select the Yelp dataset 
df_yelp = df[df['source'] == 'yelp']

# Get an Numpy array with the values of the sentense to generate the Training X dataset
sentences = df_yelp['sentence'].values

# Here we get the Y variable with the classification values (1,0)
y = df_yelp['label'].values

# Right now we split our dataset to 75% for training and 25% fo testing
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,
                                                                    y,
                                                                    test_size=0.25,
                                                                    random_state=1000)

In [2]:
'''
 It takes the words of each sentence and creates a vocabulary of all the unique words in the sentences. 
 This vocabulary can then be used to create a feature vector of the count of the words
'''

'''
Here we will use again on the previous BOW model to vectorize the sentences. 
You can use again the CountVectorizer for this task. Since you might not have 
the testing data available during training, you can create the vocabulary 
using only the training data. Using this vocabulary, you can create the 
feature vectors for each sentence of the training and testing set
'''
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

In [3]:
# Here we can see our complete sentence
sentences_train[1]


Out[3]:
'Sorry, I will not be getting food from here anytime soon :('

In [4]:
# Here we have our sentence converted in matrix of token counts
print(X_train[1])


  (0, 63)	1
  (0, 136)	1
  (0, 597)	1
  (0, 616)	1
  (0, 638)	1
  (0, 725)	1
  (0, 1001)	1
  (0, 1372)	1
  (0, 1377)	1
  (0, 1674)	1

In [5]:
# If we use a Pandas dataframe just to see this dataset we have an array with 1713 positions with 
# Binary indications of if there's a occurence of the word or not.
# This 1713 positions it's the size of the dictionary of all dataset. 
# Acording with Scikit-Learning documentation, this is "a sparse representation of the 
# counts using scipy.sparse.csr_matrix.""
pd.DataFrame(X_train[1].toarray())


Out[5]:
0 1 2 3 4 5 6 7 8 9 ... 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 1714 columns


In [6]:
# In this case let's grab all the positions of or array. 
# I'll transpose only to visualization purposes, and at the end
# I'll grab only the rows with value equals 1

sentence_df = pd.DataFrame(X_train[1].toarray()).T
sentence_df[sentence_df[0] == 1]

# As we can see, we grab only the words in the following positions in the dictionary.


Out[6]:
0
63 1
136 1
597 1
616 1
638 1
725 1
1001 1
1372 1
1377 1
1674 1

In [7]:
# In the test data we have only the categorical information
print(y_test[1])


0

In [8]:
# And this is our dictionary
vectorizer.vocabulary_


Out[8]:
{'the': 1494,
 'food': 597,
 'was': 1634,
 'barely': 125,
 'lukewarm': 888,
 'so': 1360,
 'it': 801,
 'must': 973,
 'have': 710,
 'been': 145,
 'sitting': 1345,
 'waiting': 1622,
 'for': 600,
 'server': 1308,
 'to': 1524,
 'bring': 201,
 'out': 1042,
 'us': 1587,
 'sorry': 1377,
 'will': 1674,
 'not': 1001,
 'be': 136,
 'getting': 638,
 'from': 616,
 'here': 725,
 'anytime': 63,
 'soon': 1372,
 'of': 1010,
 'all': 42,
 'dishes': 434,
 'salmon': 1265,
 'best': 155,
 'but': 223,
 'were': 1657,
 'great': 665,
 'fries': 615,
 'hot': 750,
 'and': 57,
 'neither': 988,
 'my': 974,
 'burger': 215,
 'in': 780,
 'fact': 540,
 'going': 650,
 'round': 1250,
 'up': 1583,
 'stars': 1403,
 'just': 818,
 'because': 143,
 'she': 1323,
 'awesome': 106,
 'go': 647,
 'back': 115,
 'next': 991,
 'trip': 1553,
 'this': 1511,
 'first': 577,
 'crawfish': 350,
 'experience': 531,
 'delicious': 393,
 'could': 336,
 'stomach': 1418,
 'meal': 913,
 'didn': 411,
 'complain': 312,
 'business': 220,
 'lunch': 889,
 'way': 1645,
 'finish': 574,
 'service': 1311,
 'ever': 517,
 'maria': 905,
 'our': 1040,
 'good': 654,
 'friendly': 613,
 'made': 892,
 'day': 375,
 'drink': 464,
 'never': 989,
 'empty': 496,
 'he': 712,
 'some': 1364,
 'really': 1193,
 'menu': 930,
 'suggestions': 1448,
 'sure': 1456,
 'order': 1033,
 'dessert': 406,
 'even': 514,
 'if': 769,
 'you': 1707,
 'need': 982,
 'pack': 1058,
 'tiramisu': 1523,
 'cannoli': 239,
 'are': 76,
 'both': 187,
 'die': 412,
 'excellent': 524,
 'very': 1608,
 'tapas': 1470,
 'we': 1648,
 'loved': 879,
 'biscuits': 166,
 'place': 1111,
 'receives': 1198,
 'their': 1496,
 'appetizers': 74,
 'they': 1502,
 'performed': 1092,
 'refried': 1208,
 'beans': 138,
 'that': 1492,
 'came': 236,
 'with': 1679,
 'dried': 462,
 'crusty': 362,
 'bland': 171,
 'prices': 1144,
 'eaten': 479,
 'multiple': 969,
 'times': 1521,
 'each': 477,
 'time': 1520,
 'poor': 1125,
 'thats': 1493,
 'being': 150,
 'nice': 992,
 'prompt': 1155,
 'dont': 445,
 'immediately': 776,
 'said': 1262,
 'wanted': 1629,
 'talk': 1468,
 'manager': 901,
 'did': 410,
 'want': 1628,
 'guy': 685,
 'who': 1669,
 'doing': 441,
 'shots': 1330,
 'fireball': 575,
 'behind': 149,
 'bar': 123,
 'chips': 279,
 'salsa': 1267,
 'fresh': 610,
 'flair': 580,
 'bartenders': 129,
 'absolutely': 22,
 'amazing': 51,
 'started': 1404,
 'review': 1229,
 'two': 1565,
 'editing': 483,
 'give': 641,
 'only': 1026,
 'one': 1023,
 'also': 47,
 'decided': 380,
 'send': 1301,
 'waitress': 1623,
 'looked': 873,
 'like': 858,
 'on': 1021,
 'verge': 1606,
 'having': 711,
 'heart': 717,
 'attack': 95,
 'wouldn': 1695,
 'return': 1226,
 'fried': 611,
 'rice': 1237,
 'dry': 470,
 'as': 85,
 'well': 1655,
 'got': 657,
 'poisoning': 1122,
 'at': 90,
 'buffet': 210,
 'bathrooms': 132,
 'location': 870,
 'dirty': 421,
 'seat': 1290,
 'covers': 346,
 'replenished': 1220,
 'plain': 1113,
 'yucky': 1709,
 'is': 799,
 'pretty': 1141,
 'little': 865,
 'vibe': 1610,
 'restaurant': 1224,
 'can': 238,
 'watch': 1640,
 'them': 1497,
 'preparing': 1139,
 'nachos': 976,
 'much': 966,
 'better': 156,
 'than': 1490,
 'other': 1037,
 'ayce': 110,
 'sushi': 1459,
 'went': 1656,
 'vegas': 1598,
 'waiter': 1621,
 'me': 912,
 'feel': 557,
 'stupid': 1433,
 'every': 518,
 'table': 1462,
 'han': 695,
 'nan': 978,
 'chicken': 273,
 'tasty': 1476,
 'had': 690,
 'pho': 1100,
 'tasted': 1474,
 'far': 550,
 'too': 1533,
 'many': 903,
 'places': 1112,
 've': 1596,
 'seen': 1298,
 'any': 61,
 'serves': 1310,
 'egg': 486,
 'breakfast': 199,
 'especially': 509,
 '00': 0,
 'sauce': 1275,
 'tasteless': 1475,
 'batter': 133,
 'meat': 916,
 'ratio': 1184,
 'tenders': 1485,
 'unsatisfying': 1578,
 'needless': 984,
 'say': 1279,
 'again': 36,
 'beyond': 158,
 'bad': 117,
 'jalapeno': 806,
 'bacon': 116,
 'soooo': 1373,
 'pancake': 1065,
 'large': 833,
 'perfect': 1089,
 'family': 547,
 'atmosphere': 92,
 'see': 1295,
 'gave': 631,
 'trying': 1559,
 'eat': 478,
 'crust': 361,
 'teeth': 1480,
 'still': 1415,
 'sore': 1375,
 'hands': 701,
 'down': 453,
 'phoenix': 1101,
 'metro': 934,
 'area': 77,
 'waitresses': 1624,
 'clean': 290,
 'inexpensive': 785,
 'boot': 185,
 'inside': 790,
 'quite': 1172,
 'honeslty': 740,
 'taste': 1473,
 'strawberry': 1423,
 'tea': 1478,
 'which': 1666,
 'wasn': 1636,
 'busy': 222,
 'now': 1004,
 'know': 826,
 'why': 1671,
 'don': 443,
 'how': 755,
 'managed': 899,
 'served': 1307,
 'blandest': 172,
 'when': 1662,
 'indian': 782,
 'cuisine': 364,
 'stinks': 1416,
 'bowl': 192,
 'after': 34,
 'bite': 169,
 'hooked': 744,
 'highly': 729,
 'recommended': 1203,
 'point': 1121,
 'your': 1708,
 'finger': 573,
 'item': 803,
 'won': 1682,
 'disappointed': 423,
 'vanilla': 1595,
 'ice': 767,
 'cream': 352,
 'creamy': 353,
 'smooth': 1359,
 'while': 1667,
 'profiterole': 1152,
 'choux': 282,
 'pastry': 1077,
 'enough': 503,
 'll': 868,
 'definitely': 388,
 'sadly': 1259,
 'gordon': 656,
 'ramsey': 1173,
 'steak': 1409,
 'shall': 1320,
 'sharply': 1321,
 'avoid': 103,
 'during': 474,
 'bruschetta': 207,
 'devine': 409,
 'wow': 1697,
 'spicy': 1393,
 'reminds': 1219,
 'mom': 953,
 'pop': 1127,
 'shops': 1329,
 'san': 1269,
 'francisco': 608,
 'bay': 134,
 'descriptions': 401,
 'yum': 1711,
 'another': 60,
 'eel': 484,
 'yet': 1706,
 'mayo': 911,
 'none': 1000,
 'rolls': 1248,
 'sauces': 1276,
 'omg': 1020,
 'felt': 562,
 'thai': 1489,
 'until': 1579,
 'dish': 433,
 'caring': 247,
 'teamwork': 1479,
 'professional': 1151,
 'degree': 389,
 'perhaps': 1093,
 'caught': 257,
 'an': 56,
 'off': 1011,
 'night': 994,
 'judging': 815,
 'by': 226,
 'reviews': 1231,
 'inspired': 791,
 'mediocre': 919,
 'seafood': 1286,
 'generous': 635,
 'portion': 1129,
 'walked': 1625,
 'smelled': 1356,
 'old': 1018,
 'grease': 663,
 'trap': 1548,
 'others': 1038,
 'there': 1500,
 'eating': 480,
 'consider': 322,
 'theft': 1495,
 'staff': 1400,
 'no': 996,
 'flavor': 582,
 'itself': 805,
 'totally': 1539,
 'overcooked': 1050,
 'charcoal': 261,
 'disgusting': 432,
 'wait': 1619,
 'lived': 867,
 'since': 1343,
 '1979': 6,
 'last': 836,
 'stepped': 1413,
 'foot': 599,
 'into': 797,
 'sashimi': 1272,
 'quality': 1168,
 'soggy': 1361,
 'reasonably': 1196,
 'priced': 1143,
 'hard': 705,
 'judge': 814,
 'whether': 1665,
 'these': 1501,
 'sides': 1337,
 'grossed': 677,
 'melted': 927,
 'styrofoam': 1435,
 'fear': 556,
 'sick': 1335,
 'leave': 842,
 'what': 1659,
 'double': 448,
 'cheeseburger': 268,
 'crepe': 355,
 'station': 1405,
 'disbelief': 428,
 'qualified': 1167,
 'worst': 1692,
 'version': 1607,
 'foods': 598,
 'however': 756,
 'recent': 1199,
 'particular': 1071,
 'weekly': 1653,
 'haunt': 709,
 'come': 306,
 'once': 1022,
 'bye': 227,
 'tip': 1522,
 'lady': 832,
 'ordered': 1034,
 'lemon': 848,
 'raspberry': 1179,
 'cocktail': 295,
 'incredible': 781,
 'stopped': 1421,
 'madison': 894,
 'ironman': 798,
 'kind': 825,
 'fo': 593,
 'take': 1467,
 'or': 1032,
 'super': 1455,
 'quick': 1170,
 'crazy': 351,
 'crowds': 359,
 'downtown': 456,
 'juries': 817,
 'lawyers': 839,
 'court': 343,
 'least': 840,
 'think': 1507,
 'refill': 1206,
 'water': 1642,
 'before': 147,
 'struggle': 1430,
 'wave': 1644,
 'over': 1048,
 '10': 1,
 'minutes': 944,
 'nicest': 993,
 'owners': 1055,
 'across': 29,
 'nothing': 1003,
 'authentic': 100,
 'about': 19,
 'chow': 283,
 'mein': 924,
 'slow': 1351,
 'attentive': 97,
 'cow': 347,
 'tongue': 1531,
 'cheek': 266,
 'tacos': 1465,
 'sat': 1273,
 'right': 1240,
 'get': 637,
 'fantastic': 549,
 'def': 386,
 'coming': 308,
 'truly': 1557,
 'unbelievably': 1568,
 'am': 50,
 'glad': 644,
 'extensive': 535,
 'provides': 1159,
 'lots': 877,
 'options': 1031,
 'joint': 811,
 'always': 49,
 'impressive': 779,
 'hasn': 707,
 'closed': 292,
 'flavorful': 584,
 'has': 706,
 'amount': 54,
 'heat': 719,
 'probably': 1149,
 'cheated': 263,
 'wasting': 1639,
 'opportunity': 1030,
 'company': 311,
 'several': 1317,
 'past': 1075,
 'thought': 1514,
 'above': 20,
 'average': 101,
 'worth': 1693,
 'perpared': 1094,
 'beautiful': 141,
 'presentation': 1140,
 'giant': 639,
 'slices': 1350,
 'toast': 1525,
 'lightly': 857,
 'dusted': 475,
 'powdered': 1135,
 'sugar': 1445,
 'overall': 1049,
 'anyway': 64,
 'fs': 621,
 'wonderful': 1683,
 'rock': 1246,
 'casino': 255,
 'step': 1412,
 'forward': 605,
 'special': 1385,
 'thanks': 1491,
 'dylan': 476,
 'recommendation': 1202,
 'yummy': 1712,
 'tummy': 1560,
 'classics': 289,
 'new': 990,
 'few': 563,
 'sorely': 1376,
 'everything': 520,
 'possible': 1131,
 'zero': 1713,
 'husband': 764,
 'ate': 91,
 'ambience': 53,
 'cooked': 327,
 'perfection': 1090,
 'impeccable': 777,
 'mess': 933,
 'disappointing': 424,
 'believe': 151,
 'oysters': 1056,
 'patio': 1078,
 'seating': 1292,
 'comfortable': 307,
 'look': 872,
 'else': 491,
 'where': 1664,
 'hurry': 763,
 'maybe': 910,
 'weren': 1658,
 'cold': 300,
 'would': 1694,
 'somewhat': 1369,
 'edible': 481,
 'left': 844,
 'ache': 27,
 'rest': 1222,
 'love': 878,
 'dirt': 420,
 'disgrace': 429,
 'thing': 1505,
 'prime': 1147,
 'rib': 1235,
 'section': 1294,
 'pork': 1128,
 'sandwich': 1270,
 'screams': 1284,
 'legit': 846,
 'book': 184,
 'somethat': 1367,
 'rare': 1177,
 'wife': 1673,
 'hated': 708,
 'her': 724,
 'coconut': 297,
 'shrimp': 1334,
 'friends': 614,
 'enjoy': 500,
 'meals': 914,
 'either': 489,
 'lot': 876,
 'promise': 1154,
 'fails': 542,
 'deliver': 399,
 'furthermore': 626,
 'find': 571,
 'hours': 753,
 'operation': 1028,
 'website': 1650,
 'happy': 704,
 'customer': 366,
 'course': 342,
 'turn': 1562,
 'doubt': 449,
 'unless': 1577,
 'someone': 1366,
 'buying': 225,
 'bathroom': 131,
 'door': 446,
 'found': 606,
 'live': 866,
 'green': 669,
 'caterpillar': 256,
 'salad': 1263,
 'desserts': 407,
 'bit': 167,
 'strange': 1422,
 'pricing': 1146,
 'concern': 319,
 'mellow': 925,
 'mushroom': 970,
 'arrives': 82,
 'meh': 923,
 'fiancé': 564,
 'middle': 937,
 'greeted': 671,
 'seated': 1291,
 'away': 105,
 'running': 1255,
 'realized': 1192,
 'his': 734,
 'sunglasses': 1454,
 'close': 291,
 'room': 1249,
 'temp': 1482,
 'watched': 1641,
 'prepare': 1138,
 'bare': 124,
 'gloves': 646,
 'deep': 384,
 'oil': 1016,
 'ribeye': 1236,
 'perfectly': 1091,
 'mesquite': 932,
 'then': 1499,
 'disappointment': 425,
 'ensued': 504,
 'check': 264,
 'regularly': 1211,
 'gyro': 687,
 'basically': 130,
 'lettuce': 852,
 'eew': 485,
 'needs': 985,
 'complete': 314,
 'overhaul': 1051,
 'though': 1513,
 'people': 1087,
 'ians': 766,
 'lacked': 829,
 'seemed': 1296,
 'undercooked': 1570,
 'recently': 1200,
 'witnessed': 1681,
 'management': 900,
 'towards': 1543,
 'guests': 684,
 'sliced': 1349,
 'brisket': 203,
 'pulled': 1162,
 'twice': 1564,
 'visit': 1614,
 'enjoyed': 502,
 'serve': 1306,
 'vinaigrette': 1611,
 'may': 909,
 'make': 898,
 'recommend': 1201,
 'chipotle': 278,
 'town': 1544,
 'spend': 1388,
 'money': 954,
 'elsewhere': 492,
 'mexican': 935,
 'whole': 1670,
 'bunch': 214,
 'interesting': 796,
 'meats': 918,
 'choose': 281,
 'try': 1558,
 'swung': 1461,
 'deeply': 385,
 'airport': 39,
 'speedy': 1387,
 'burgers': 216,
 'aren': 78,
 'pizza': 1109,
 'used': 1589,
 'doughy': 452,
 'flavorless': 585,
 'servers': 1309,
 'imaginative': 774,
 'leaves': 843,
 'desired': 403,
 'grilled': 673,
 'tender': 1484,
 'yellow': 1704,
 'saffron': 1260,
 'seasoning': 1289,
 'gotten': 658,
 'services': 1312,
 'received': 1197,
 'albondigas': 41,
 'soup': 1380,
 'warm': 1631,
 'tomato': 1530,
 'frozen': 618,
 'meatballs': 917,
 'wine': 1675,
 'everyone': 519,
 'treated': 1550,
 'equally': 508,
 'actually': 31,
 'ample': 55,
 'portions': 1130,
 'vegetarian': 1600,
 'fare': 551,
 'took': 1534,
 'hour': 752,
 'tables': 1463,
 'luke': 887,
 'sever': 1316,
 'around': 79,
 'overwhelmed': 1053,
 'outshining': 1044,
 'halibut': 693,
 'gyros': 688,
 'arrived': 81,
 'missing': 948,
 'different': 413,
 'cut': 369,
 'piece': 1105,
 'flavored': 583,
 'forth': 603,
 'helped': 722,
 'rude': 1253,
 'drinks': 466,
 'weak': 1649,
 'group': 679,
 '70': 16,
 'claimed': 286,
 '40': 12,
 'handled': 698,
 'beautifully': 142,
 'experienced': 532,
 'frenchman': 609,
 'ryan': 1257,
 'edinburgh': 482,
 'establishment': 510,
 'revisiting': 1232,
 'ones': 1024,
 'scene': 1283,
 'restaurants': 1225,
 'lost': 875,
 'proven': 1158,
 'dead': 376,
 'wrong': 1699,
 'fast': 552,
 'bus': 219,
 'boy': 194,
 'hand': 696,
 'apologize': 67,
 'anything': 62,
 'toasted': 1526,
 'english': 499,
 'muffin': 967,
 'untoasted': 1580,
 'insulted': 794,
 'price': 1142,
 'rather': 1182,
 'gone': 653,
 'impressed': 778,
 'strip': 1428,
 'thumbs': 1518,
 'strings': 1427,
 'pasta': 1076,
 'bottom': 189,
 'daily': 371,
 'specials': 1386,
 'hit': 735,
 'delicioso': 392,
 'stop': 1420,
 'whenever': 1663,
 'do': 438,
 'staying': 1408,
 'mirage': 945,
 'join': 810,
 'club': 293,
 'offers': 1012,
 'via': 1609,
 'email': 493,
 'assure': 89,
 'seriously': 1304,
 'cannot': 240,
 'owner': 1054,
 'unexperienced': 1572,
 'employees': 495,
 'chickens': 274,
 'heads': 714,
 'high': 727,
 'hopes': 746,
 'grill': 672,
 'unfortunately': 1573,
 'fell': 560,
 'flat': 581,
 'selection': 1299,
 'favorite': 555,
 'italian': 802,
 'tried': 1551,
 'caballero': 228,
 'week': 1651,
 'side': 1336,
 'spot': 1396,
 'combos': 305,
 'beer': 146,
 '23': 9,
 'decent': 379,
 'deal': 377,
 'bakery': 119,
 'leftover': 845,
 'stale': 1401,
 'due': 472,
 '20': 7,
 'acknowledged': 28,
 '35': 11,
 'kept': 820,
 'forgetting': 602,
 'things': 1506,
 'ambiance': 52,
 'setting': 1315,
 'more': 958,
 'douchey': 450,
 'indoor': 784,
 'garden': 628,
 'terrible': 1486,
 'red': 1204,
 'velvet': 1601,
 'cake': 231,
 'ohhh': 1015,
 'stuff': 1431,
 'unbelievable': 1567,
 'bargain': 126,
 'toro': 1537,
 'tartare': 1472,
 'cavier': 258,
 'extraordinary': 537,
 'liked': 859,
 'thinly': 1508,
 'wagyu': 1618,
 'white': 1668,
 'truffle': 1556,
 'downside': 455,
 'bread': 197,
 'house': 754,
 'iced': 768,
 'diverse': 437,
 'giving': 643,
 'appetite': 72,
 'instantly': 792,
 'healthy': 715,
 'ethic': 512,
 'despicable': 404,
 'literally': 864,
 'asking': 88,
 'long': 871,
 'stood': 1419,
 'begin': 148,
 'awkwardly': 109,
 'reminded': 1218,
 'tater': 1477,
 'tots': 1540,
 'southwest': 1383,
 'helpful': 723,
 'bloddy': 175,
 'mary': 908,
 'considering': 323,
 'full': 622,
 'summarize': 1449,
 'nay': 980,
 'transcendant': 1547,
 'brings': 202,
 'joy': 813,
 'memory': 928,
 'pneumatic': 1120,
 'condiment': 321,
 'dispenser': 435,
 'kids': 824,
 'kiddos': 823,
 'cute': 370,
 'simply': 1342,
 'croutons': 358,
 'homemade': 739,
 'extra': 536,
 'plus': 1119,
 'shower': 1333,
 'outside': 1045,
 'rinse': 1242,
 'mind': 943,
 'nude': 1005,
 'hits': 736,
 'something': 1368,
 'lacking': 830,
 'quantity': 1169,
 'bother': 188,
 'hamburger': 694,
 'peanut': 1082,
 'underwhelming': 1571,
 'ninja': 995,
 'inflate': 786,
 'smaller': 1353,
 'attitudes': 98,
 'grow': 681,
 'rapidly': 1176,
 'shawarrrrrrma': 1322,
 'dinner': 417,
 'dollars': 442,
 'google': 655,
 'imagine': 775,
 'smashburger': 1354,
 'noca': 998,
 'oh': 1014,
 'nyc': 1007,
 'bagels': 118,
 'selections': 1300,
 'cheese': 267,
 'real': 1191,
 'lox': 886,
 'capers': 243,
 'serving': 1313,
 'genuinely': 636,
 'pleasant': 1116,
 'enthusiastic': 505,
 'treat': 1549,
 'duck': 471,
 'pink': 1107,
 'char': 260,
 'four': 607,
 'blue': 180,
 'shirt': 1324,
 'letting': 851,
 'final': 569,
 'blow': 178,
 'crisp': 356,
 'isn': 800,
 'small': 1352,
 'fine': 572,
 'dining': 416,
 'extremely': 538,
 'chefs': 271,
 'job': 808,
 'sexy': 1319,
 'party': 1073,
 'mouth': 961,
 're': 1188,
 'outrageously': 1043,
 'flirting': 588,
 'hottest': 751,
 'person': 1095,
 'told': 1528,
 'happened': 702,
 'hole': 737,
 'wall': 1626,
 'street': 1424,
 'salads': 1264,
 'bachi': 114,
 'friend': 612,
 'stretch': 1425,
 'imagination': 773,
 '40min': 13,
 'passed': 1074,
 'between': 157,
 'ordering': 1035,
 'arriving': 83,
 'horrible': 747,
 'waste': 1637,
 'dealing': 378,
 'world': 1690,
 'annoying': 59,
 'drunk': 469,
 'shouldn': 1332,
 '30': 10,
 'min': 942,
 'pancakes': 1066,
 'eggs': 488,
 'total': 1538,
 'letdown': 850,
 'camelback': 237,
 'flower': 590,
 'shop': 1328,
 'cartel': 251,
 'coffee': 299,
 'pace': 1057,
 'chewy': 272,
 'almost': 45,
 'excuse': 525,
 'spaghetti': 1384,
 'whatsoever': 1660,
 ...}

In [9]:
# Now we'll test all datasources using Logistic Regression as a baseline. 
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,
                                                                        y,
                                                                        test_size=0.25,
                                                                        random_state=1000)

    # Create dictionary of an sparce matrix
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))


Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

In [10]:
# A Dense nwtwork fully connected using the Sequential API. 
# Each layer has 10 neurons fully connected, and the actionation function 
# to correct the gradients it's the "ReLu". To the final layer
# The activation function it's the sigmoid one. 

from keras.models import Sequential
from keras import layers

# Definition of the features to be inputed in the model
# In this case we have 2505 features. 
input_dim = X_train.shape[1]

# The Sequencial API to add the layers
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# As we're dealing with a classification problem, we'll use 
# a loss function of binary_crossentropy, with the Adam optimizer
# to enhance the convergence and the metric to be monitored to the 
# model will be accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()

# In this mode we'll train using 50 epochs and with a small batch size of 10
# It means that for every epoch a sample of 10 records will be used to be 
# propagated through the network during the training phase.
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

# Model Evaluation: Training
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Model Evaluation: Test
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


Using TensorFlow backend.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 10)                25060     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
=================================================================
Total params: 25,071
Trainable params: 25,071
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Testing Accuracy:  0.7861

In [11]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [12]:
plot_history(history)



In [13]:
'''
Now you need to tokenize the data into a format that can be used by the word embeddings. 
Keras offers a couple of convenience methods for text preprocessing and sequence 
preprocessing which you can employ to prepare your text.

You can start by using the Tokenizer utility class which can vectorize a 
text corpus into a list of integers. Each integer maps to a value in a dictionary 
that encodes the entire corpus, with the keys in the dictionary being the 
vocabulary terms themselves. You can add the parameter num_words, which is 
responsible for setting the size of the vocabulary. The most common 
num_words words will be then kept. I have the testing and training 
data prepared from the previous example:

'''
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(sentences_train[2])
print(X_train[2])


'''
With CountVectorizer, we had stacked vectors of word counts,
and each vector was the same length (the size of the total 
corpus vocabulary). With Tokenizer, the resulting vectors 
equal the length of each text, and the numbers don’t denote
counts, but rather correspond to the word values from the
dictionary tokenizer.word_index.
'''


I am a fan of his ... This movie sucked really bad.  
[7, 150, 2, 932, 4, 49, 6, 11, 563, 45, 30]

In [14]:
for word in ['the', 'all', 'good', 'nice']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))


the: 1
all: 27
good: 33
nice: 257

In [15]:
# Here we can see our dictionaty where the words are the keys of our
# dictionary map, and the order of the words was based in their frequency
tokenizer.word_index


Out[15]:
{'the': 1,
 'a': 2,
 'and': 3,
 'of': 4,
 'is': 5,
 'this': 6,
 'i': 7,
 'it': 8,
 'to': 9,
 'in': 10,
 'movie': 11,
 'was': 12,
 'film': 13,
 'that': 14,
 '0': 15,
 'for': 16,
 'as': 17,
 '1': 18,
 'but': 19,
 'with': 20,
 'not': 21,
 'are': 22,
 'on': 23,
 "it's": 24,
 'one': 25,
 'you': 26,
 'all': 27,
 'so': 28,
 'just': 29,
 'bad': 30,
 'at': 31,
 'very': 32,
 'good': 33,
 'an': 34,
 'out': 35,
 'there': 36,
 'be': 37,
 'by': 38,
 'time': 39,
 'like': 40,
 'have': 41,
 'or': 42,
 'about': 43,
 'great': 44,
 'really': 45,
 'from': 46,
 'even': 47,
 'characters': 48,
 'his': 49,
 'who': 50,
 'if': 51,
 'more': 52,
 'see': 53,
 'acting': 54,
 'were': 55,
 '10': 56,
 'has': 57,
 'my': 58,
 'some': 59,
 'well': 60,
 'only': 61,
 'no': 62,
 'he': 63,
 'because': 64,
 'most': 65,
 'when': 66,
 "don't": 67,
 'its': 68,
 'how': 69,
 'story': 70,
 'movies': 71,
 'best': 72,
 'other': 73,
 "didn't": 74,
 'plot': 75,
 'character': 76,
 'can': 77,
 'also': 78,
 'than': 79,
 'real': 80,
 'up': 81,
 'ever': 82,
 'seen': 83,
 'think': 84,
 'your': 85,
 'love': 86,
 'make': 87,
 'what': 88,
 'me': 89,
 'any': 90,
 'do': 91,
 'they': 92,
 'which': 93,
 'will': 94,
 'better': 95,
 'made': 96,
 'watching': 97,
 'had': 98,
 'way': 99,
 'every': 100,
 'her': 101,
 'could': 102,
 'funny': 103,
 'too': 104,
 'their': 105,
 'would': 106,
 'never': 107,
 'them': 108,
 'wonderful': 109,
 'actors': 110,
 'scenes': 111,
 'still': 112,
 'much': 113,
 'look': 114,
 'writing': 115,
 'worth': 116,
 'over': 117,
 'know': 118,
 'many': 119,
 'into': 120,
 'cast': 121,
 'work': 122,
 'totally': 123,
 'little': 124,
 'anyone': 125,
 'music': 126,
 'films': 127,
 'here': 128,
 'people': 129,
 'such': 130,
 'those': 131,
 'after': 132,
 'script': 133,
 'show': 134,
 'scene': 135,
 'stupid': 136,
 'watch': 137,
 'awful': 138,
 'been': 139,
 'two': 140,
 'screen': 141,
 'now': 142,
 'line': 143,
 'years': 144,
 'go': 145,
 'thought': 146,
 'excellent': 147,
 'waste': 148,
 'things': 149,
 'am': 150,
 'short': 151,
 'everything': 152,
 'get': 153,
 'thing': 154,
 'liked': 155,
 'boring': 156,
 'should': 157,
 'we': 158,
 'piece': 159,
 'part': 160,
 'right': 161,
 'both': 162,
 'did': 163,
 'saw': 164,
 'horror': 165,
 'special': 166,
 "i've": 167,
 'though': 168,
 'dialogue': 169,
 'definitely': 170,
 'does': 171,
 'enough': 172,
 'game': 173,
 'pretty': 174,
 'minutes': 175,
 'say': 176,
 'worst': 177,
 'directing': 178,
 'going': 179,
 'makes': 180,
 'man': 181,
 'give': 182,
 'predictable': 183,
 'actually': 184,
 'terrible': 185,
 'nothing': 186,
 'ending': 187,
 "can't": 188,
 'she': 189,
 'interesting': 190,
 'absolutely': 191,
 'believe': 192,
 'being': 193,
 'quite': 194,
 'recommend': 195,
 'avoid': 196,
 'sucks': 197,
 'least': 198,
 'however': 199,
 'wasted': 200,
 'beautiful': 201,
 'camera': 202,
 'certainly': 203,
 'cinematography': 204,
 'actor': 205,
 'performance': 206,
 'long': 207,
 'find': 208,
 "that's": 209,
 'our': 210,
 'played': 211,
 'truly': 212,
 'art': 213,
 'especially': 214,
 'end': 215,
 "doesn't": 216,
 'around': 217,
 'play': 218,
 'life': 219,
 'old': 220,
 'simply': 221,
 'these': 222,
 'effects': 223,
 'suspense': 224,
 'again': 225,
 'off': 226,
 'through': 227,
 'throughout': 228,
 'editing': 229,
 'self': 230,
 'far': 231,
 'written': 232,
 'memorable': 233,
 'lot': 234,
 'probably': 235,
 'kids': 236,
 'loved': 237,
 'black': 238,
 'white': 239,
 'understand': 240,
 'got': 241,
 'worse': 242,
 'hilarious': 243,
 'feeling': 244,
 'enjoyed': 245,
 'big': 246,
 'family': 247,
 'john': 248,
 "there's": 249,
 'believable': 250,
 'almost': 251,
 'own': 252,
 'comedy': 253,
 'must': 254,
 'top': 255,
 'times': 256,
 'nice': 257,
 'together': 258,
 'few': 259,
 'each': 260,
 'used': 261,
 "won't": 262,
 'myself': 263,
 'before': 264,
 'put': 265,
 'tom': 266,
 'terrific': 267,
 'annoying': 268,
 'cheap': 269,
 'playing': 270,
 'lines': 271,
 'want': 272,
 'him': 273,
 'another': 274,
 'experience': 275,
 'kind': 276,
 'style': 277,
 'action': 278,
 'clever': 279,
 'then': 280,
 'different': 281,
 "i'm": 282,
 'disappointed': 283,
 'anything': 284,
 'fast': 285,
 'classic': 286,
 'second': 287,
 'job': 288,
 'rent': 289,
 'garbage': 290,
 'director': 291,
 'series': 292,
 'budget': 293,
 'highly': 294,
 'cinema': 295,
 'rather': 296,
 'drama': 297,
 'fact': 298,
 'cult': 299,
 'non': 300,
 'whatever': 301,
 'human': 302,
 'money': 303,
 'sound': 304,
 "wasn't": 305,
 'poor': 306,
 'amount': 307,
 'portrayal': 308,
 'amazing': 309,
 'keep': 310,
 'year': 311,
 'away': 312,
 'overall': 313,
 'mind': 314,
 'created': 315,
 'half': 316,
 'hours': 317,
 'involved': 318,
 'ups': 319,
 'entire': 320,
 'point': 321,
 'use': 322,
 'joy': 323,
 "i'd": 324,
 'pg': 325,
 'found': 326,
 'during': 327,
 'production': 328,
 'making': 329,
 'gives': 330,
 'since': 331,
 'attempt': 332,
 'down': 333,
 'everyone': 334,
 'mess': 335,
 'generally': 336,
 'seem': 337,
 'slow': 338,
 'seeing': 339,
 'itself': 340,
 'looked': 341,
 'song': 342,
 'cool': 343,
 'star': 344,
 'become': 345,
 'gave': 346,
 'full': 347,
 'first': 348,
 'guess': 349,
 'started': 350,
 'parts': 351,
 'works': 352,
 'book': 353,
 'lead': 354,
 'try': 355,
 'whole': 356,
 'holes': 357,
 'something': 358,
 'oh': 359,
 'back': 360,
 'whether': 361,
 'appreciate': 362,
 'subtle': 363,
 'particularly': 364,
 'maybe': 365,
 'entertaining': 366,
 'felt': 367,
 'gets': 368,
 'recommended': 369,
 'death': 370,
 'tv': 371,
 'girl': 372,
 'where': 373,
 'world': 374,
 'easily': 375,
 'between': 376,
 'fun': 377,
 'scamp': 378,
 'care': 379,
 'follow': 380,
 "they're": 381,
 'glad': 382,
 'bit': 383,
 '2': 384,
 'minute': 385,
 'costs': 386,
 'mediocre': 387,
 'lovely': 388,
 'yet': 389,
 'checking': 390,
 'let': 391,
 'frightening': 392,
 'barely': 393,
 'us': 394,
 'rate': 395,
 'scale': 396,
 'audience': 397,
 'close': 398,
 'genuine': 399,
 'scenery': 400,
 'moment': 401,
 'including': 402,
 'faux': 403,
 'footage': 404,
 "i'll": 405,
 '\x96': 406,
 'came': 407,
 'free': 408,
 'bought': 409,
 'ready': 410,
 'crap': 411,
 "wouldn't": 412,
 'gem': 413,
 'rating': 414,
 'lame': 415,
 'sometimes': 416,
 'happened': 417,
 'word': 418,
 'insult': 419,
 'remake': 420,
 'shows': 421,
 'without': 422,
 'said': 423,
 'animation': 424,
 'same': 425,
 'share': 426,
 'level': 427,
 'quality': 428,
 'sub': 429,
 'recent': 430,
 'looks': 431,
 'role': 432,
 'period': 433,
 'fine': 434,
 'average': 435,
 'finally': 436,
 'brilliance': 437,
 'moving': 438,
 'young': 439,
 'day': 440,
 'watchable': 441,
 'photography': 442,
 'features': 443,
 'brilliant': 444,
 'already': 445,
 'thrown': 446,
 'whatsoever': 447,
 'expect': 448,
 'serious': 449,
 'come': 450,
 'head': 451,
 'showed': 452,
 'exactly': 453,
 'clichés': 454,
 'strong': 455,
 'provoking': 456,
 'seems': 457,
 'tell': 458,
 'charming': 459,
 'stories': 460,
 'less': 461,
 'although': 462,
 'wonderfully': 463,
 'south': 464,
 'heart': 465,
 'intelligent': 466,
 'face': 467,
 'why': 468,
 'possible': 469,
 'emotions': 470,
 'talk': 471,
 'true': 472,
 'god': 473,
 'pathetic': 474,
 '90': 475,
 'mean': 476,
 'storyline': 477,
 "90's": 478,
 'child': 479,
 'shots': 480,
 'giallo': 481,
 'eyes': 482,
 'ruthless': 483,
 'last': 484,
 'having': 485,
 'perfectly': 486,
 'nobody': 487,
 'watched': 488,
 'low': 489,
 'ability': 490,
 'left': 491,
 'stage': 492,
 'scared': 493,
 'beyond': 494,
 'main': 495,
 'dance': 496,
 'stars': 497,
 'fans': 498,
 'adorable': 499,
 '8': 500,
 'score': 501,
 'shot': 502,
 'presents': 503,
 'comes': 504,
 'oscar': 505,
 'completely': 506,
 'intelligence': 507,
 'incredibly': 508,
 'songs': 509,
 'pure': 510,
 'documentary': 511,
 'hope': 512,
 'happen': 513,
 'next': 514,
 "aren't": 515,
 'problems': 516,
 'tale': 517,
 'writer': 518,
 'night': 519,
 'sea': 520,
 'cartoon': 521,
 'women': 522,
 'indulgent': 523,
 'age': 524,
 'angel': 525,
 'silent': 526,
 'thinking': 527,
 'seriously': 528,
 'obviously': 529,
 'sense': 530,
 'particular': 531,
 'convincing': 532,
 "you'll": 533,
 'enjoy': 534,
 'spent': 535,
 'perfect': 536,
 "haven't": 537,
 'ridiculous': 538,
 'bored': 539,
 'children': 540,
 'rest': 541,
 'casting': 542,
 'often': 543,
 'martin': 544,
 'complete': 545,
 'words': 546,
 'lots': 547,
 'today': 548,
 'important': 549,
 'history': 550,
 'may': 551,
 'hard': 552,
 'graphics': 553,
 'massive': 554,
 'take': 555,
 'negative': 556,
 'performances': 557,
 'energy': 558,
 'values': 559,
 'wish': 560,
 'hour': 561,
 'puppets': 562,
 'sucked': 563,
 'store': 564,
 'adaptation': 565,
 'dead': 566,
 'reality': 567,
 "couldn't": 568,
 'sure': 569,
 'lost': 570,
 'whom': 571,
 'walked': 572,
 'fails': 573,
 'create': 574,
 'plus': 575,
 'paced': 576,
 'run': 577,
 'appalling': 578,
 'yeah': 579,
 'someone': 580,
 'knew': 581,
 'narrative': 582,
 'chemistry': 583,
 'house': 584,
 'remotely': 585,
 'exquisite': 586,
 'visual': 587,
 'composition': 588,
 'wonder': 589,
 'torture': 590,
 'dvd': 591,
 'player': 592,
 'turned': 593,
 'stay': 594,
 'son': 595,
 'cost': 596,
 'five': 597,
 'sets': 598,
 'place': 599,
 'episode': 600,
 'set': 601,
 'eye': 602,
 '13': 603,
 'occupied': 604,
 'instead': 605,
 'terms': 606,
 'aspect': 607,
 'takes': 608,
 'hand': 609,
 'feel': 610,
 'redeeming': 611,
 'learn': 612,
 'american': 613,
 'brain': 614,
 'italian': 615,
 'called': 616,
 'small': 617,
 'warmth': 618,
 'heaven': 619,
 'unpredictable': 620,
 'predictably': 621,
 'sleep': 622,
 'empty': 623,
 'hollow': 624,
 'awesome': 625,
 'else': 626,
 'dedication': 627,
 'hands': 628,
 'loneliness': 629,
 'turns': 630,
 'brief': 631,
 'quinn': 632,
 'century': 633,
 'leave': 634,
 'christmas': 635,
 'voice': 636,
 'depicts': 637,
 'hated': 638,
 'decent': 639,
 'list': 640,
 'pleased': 641,
 'modern': 642,
 'genius': 643,
 'drive': 644,
 'seemed': 645,
 'freedom': 646,
 '9': 647,
 '3': 648,
 'twist': 649,
 'given': 650,
 "miyazaki's": 651,
 'form': 652,
 'continuity': 653,
 'reason': 654,
 'directorial': 655,
 'opened': 656,
 'hear': 657,
 'mercy': 658,
 'killing': 659,
 'horrible': 660,
 'fear': 661,
 'trying': 662,
 'keeps': 663,
 'running': 664,
 'jamie': 665,
 'problem': 666,
 'heard': 667,
 'unbelievable': 668,
 'bear': 669,
 'kinda': 670,
 'cute': 671,
 'question': 672,
 'ask': 673,
 'hollywood': 674,
 'sand': 675,
 'masterpieces': 676,
 'supposed': 677,
 'shed': 678,
 'situation': 679,
 'dislike': 680,
 'course': 681,
 'appearance': 682,
 'africa': 683,
 'process': 684,
 'sentiment': 685,
 'revealing': 686,
 'terribly': 687,
 'deserving': 688,
 'killer': 689,
 'original': 690,
 'actress': 691,
 'pictures': 692,
 'games': 693,
 'forget': 694,
 'bore': 695,
 'whiny': 696,
 'thoroughly': 697,
 'produced': 698,
 'likes': 699,
 'single': 700,
 'deserved': 701,
 'lazy': 702,
 'explain': 703,
 'neil': 704,
 'interest': 705,
 'unfunny': 706,
 'hackneyed': 707,
 'direction': 708,
 'poorly': 709,
 'done': 710,
 'towards': 711,
 'cartoons': 712,
 'badly': 713,
 'early': 714,
 'future': 715,
 'fulci': 716,
 'addition': 717,
 'genre': 718,
 'incredible': 719,
 "'cover": 720,
 "girl'": 721,
 'wind': 722,
 'lion': 723,
 'told': 724,
 'member': 725,
 "huston's": 726,
 'steve': 727,
 'conclusion': 728,
 'bother': 729,
 'nonsense': 730,
 'sequel': 731,
 'fantastic': 732,
 'dogs': 733,
 'business': 734,
 'beautifully': 735,
 'noteworthy': 736,
 'despite': 737,
 'brian': 738,
 'unrecognizable': 739,
 'explanation': 740,
 'disliked': 741,
 'overly': 742,
 'plays': 743,
 'trash': 744,
 'released': 745,
 'etc': 746,
 'wrong': 747,
 'forces': 748,
 'solid': 749,
 'final': 750,
 "he's": 751,
 'hate': 752,
 'proud': 753,
 'theater': 754,
 'under': 755,
 'humorous': 756,
 'roles': 757,
 'era': 758,
 'accused': 759,
 'mostly': 760,
 'decay': 761,
 'hell': 762,
 'fresh': 763,
 'indeed': 764,
 'unfortunately': 765,
 'tension': 766,
 'conflict': 767,
 'along': 768,
 'delight': 769,
 'start': 770,
 'talented': 771,
 'ways': 772,
 'setting': 773,
 'imaginable': 774,
 'cover': 775,
 'new': 776,
 'cannot': 777,
 'emily': 778,
 'nuts': 779,
 'premise': 780,
 'uses': 781,
 'dreams': 782,
 'touching': 783,
 'previous': 784,
 'guys': 785,
 'entirely': 786,
 'macbeth': 787,
 'blood': 788,
 'evil': 789,
 'viewing': 790,
 'super': 791,
 'crowd': 792,
 'pleaser': 793,
 'ranks': 794,
 'among': 795,
 'looking': 796,
 'relationships': 797,
 'core': 798,
 'none': 799,
 'laugh': 800,
 'dark': 801,
 'undoubtedly': 802,
 'mention': 803,
 'superb': 804,
 'asleep': 805,
 'masterful': 806,
 'occasionally': 807,
 'versus': 808,
 'balance': 809,
 'coming': 810,
 '20': 811,
 'underneath': 812,
 'attempts': 813,
 'pitiful': 814,
 'basically': 815,
 'overacting': 816,
 'space': 817,
 'journey': 818,
 'imagination': 819,
 'mickey': 820,
 'mouse': 821,
 'crazy': 822,
 'pull': 823,
 'daughter': 824,
 'change': 825,
 'delivering': 826,
 'guy': 827,
 'received': 828,
 'wayne': 829,
 'industry': 830,
 'presence': 831,
 "isn't": 832,
 'owned': 833,
 'ten': 834,
 'thoughts': 835,
 'opening': 836,
 'sequence': 837,
 'sake': 838,
 'mark': 839,
 'possibly': 840,
 'depth': 841,
 'call': 842,
 'assistant': 843,
 'slightest': 844,
 'lacks': 845,
 'alexander': 846,
 'tremendously': 847,
 '4': 848,
 'wilkinson': 849,
 'huge': 850,
 'speed': 851,
 "joe's": 852,
 'friends': 853,
 'creates': 854,
 'angles': 855,
 'moral': 856,
 'fall': 857,
 'cardboard': 858,
 'stereotypes': 859,
 'bold': 860,
 'father': 861,
 'murdered': 862,
 'war': 863,
 'engaging': 864,
 "master's": 865,
 'lives': 866,
 'interplay': 867,
 'emilio': 868,
 'advise': 869,
 'until': 870,
 'discovering': 871,
 '30': 872,
 'achievement': 873,
 'cheesy': 874,
 'unconvincing': 875,
 'surprisingly': 876,
 'jean': 877,
 'remember': 878,
 'meaning': 879,
 'always': 880,
 'fifteen': 881,
 'pretentious': 882,
 'masterpiece': 883,
 'morgan': 884,
 'references': 885,
 'yes': 886,
 'fit': 887,
 'picture': 888,
 'buy': 889,
 'flick': 890,
 'idea': 891,
 'while': 892,
 'jonah': 893,
 'hill': 894,
 'obvious': 895,
 'perhaps': 896,
 'choice': 897,
 'front': 898,
 'pointless': 899,
 'utterly': 900,
 'reading': 901,
 'charm': 902,
 'honestly': 903,
 'gonna': 904,
 'pacing': 905,
 'surrounding': 906,
 'junkyard': 907,
 'example': 908,
 'usual': 909,
 'balanced': 910,
 'laughable': 911,
 'dumb': 912,
 'zombie': 913,
 'smart': 914,
 'costumes': 915,
 'thriller': 916,
 'ponyo': 917,
 'stinks': 918,
 'weak': 919,
 'spoil': 920,
 'either': 921,
 'starts': 922,
 'plain': 923,
 'act': 924,
 'write': 925,
 'damn': 926,
 'betty': 927,
 'earth': 928,
 'bring': 929,
 'dull': 930,
 'girlfriend': 931,
 'fan': 932,
 'lifetime': 933,
 'air': 934,
 'knows': 935,
 'sells': 936,
 'comprehensible': 937,
 'reminded': 938,
 'huston': 939,
 'evinced': 940,
 'faithful': 941,
 'james': 942,
 "joyce's": 943,
 'acclaimed': 944,
 'novella': 945,
 'zillion': 946,
 'bendingly': 947,
 'flat': 948,
 'nearly': 949,
 'case': 950,
 "'so": 951,
 "laughable'": 952,
 'suited': 953,
 'relatively': 954,
 'string': 955,
 'walk': 956,
 'theatre': 957,
 'relief': 958,
 'critical': 959,
 'proceedings': 960,
 'america': 961,
 'above': 962,
 'inventive': 963,
 'elegant': 964,
 'angle': 965,
 'lighting': 966,
 'pointillistic': 967,
 'home': 968,
 'behold': 969,
 'halfway': 970,
 'embarrassed': 971,
 'howell': 972,
 'dustin': 973,
 "hoffman's": 974,
 'tickets': 975,
 'dollars': 976,
 'mad': 977,
 'paid': 978,
 '7': 979,
 '50': 980,
 'thunderbirds': 981,
 'attractive': 982,
 'pleasing': 983,
 'lewis': 984,
 "black's": 985,
 'considerable': 986,
 'talent': 987,
 'incendiary': 988,
 'unrestrained': 989,
 'allow': 990,
 'peaking': 991,
 'paper': 992,
 'columbo': 993,
 'against': 994,
 'screenplay': 995,
 'post': 996,
 'loosely': 997,
 'public': 998,
 'kris': 999,
 'kristoffersen': 1000,
 ...}

In [16]:
'''
One problem that we have is that each text sequence has in most cases different 
length of words. To counter this, you can use pad_sequence() which simply 
pads the sequence of words with zeros. By default, it prepends zeros but 
we want to append them. Typically it does not matter whether you prepend or append zeros.

Additionally you would want to add a maxlen parameter to specify how 
long the sequences should be. This cuts sequences that exceed that number. 
In the following code, you can see how to pad sequences with Keras
'''
from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
print(X_train[0, :])


[170 116 390  35   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]

In [17]:
'''
Now you can use the Embedding Layer of Keras which takes the 
previously calculated integers and maps them to a dense vector of the 
embedding. You will need the following parameters:

- input_dim: the size of the vocabulary
- output_dim: the size of the dense vector
- input_length: the length of the sequence

With the Embedding layer we have now a couple of options. One way 
would be to take the output of the embedding layer and plug it into 
a Dense layer. In order to do this you have to add a Flatten layer 
in between that prepares the sequential input for the Dense layer:
'''

from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

'''
You can now see that we have 87350 new parameters to train. 
This number comes from vocab_size times the embedding_dim. 
These weights of the embedding layer are initialized with random 
weights and are then adjusted through backpropagation during training. 

This model takes the words as they come in the order of the 
sentences as input vectors. You can train it with the following:
'''
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 100, 50)           128750    
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
=================================================================
Total params: 178,771
Trainable params: 178,771
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Testing Accuracy:  0.6845

In [18]:
'''
Another way to work with embeddings is by using a MaxPooling1D/AveragePooling1D 
or a GlobalMaxPooling1D/GlobalAveragePooling1D layer after the 
embedding. You can think of the pooling layers as a way to 
downsample (a way to reduce the size of) the incoming feature vectors.

In the case of max pooling you take the maximum value of all
features in the pool for each feature dimension. In the case
of average pooling you take the average, but max pooling seems 
to be more commonly used as it highlights large values.

Global max/average pooling takes the maximum/average of all 
features whereas in the other case you have to define the 
pool size. Keras has again its own layer that you can add in the sequential model:
'''

from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 100, 50)           128750    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
=================================================================
Total params: 129,271
Trainable params: 129,271
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Testing Accuracy:  0.7594

In [19]:
'''
The word embeddings do not understand the text as a human would, but they rather map
the statistical structure of the language used in the corpus. Their aim is to map 
semantic meaning into a geometric space. This geometric space is then 
called the embedding space.

'''
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [20]:
# Retrieve the embedding matrix
embedding_dim = 50

embedding_matrix = create_embedding_matrix('/Users/flavioclesio/Desktop/programming-study/machine-learning/nlp/glove.6B/glove.6B.50d.txt',
                                           tokenizer.word_index,
                                           embedding_dim)

In [21]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size


Out[21]:
0.9522330097087378

In [22]:
'''
Let’s have a look at the performance when using the 
GlobalMaxPool1D layer:
'''
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_3 (Embedding)      (None, 100, 50)           128750    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
=================================================================
Total params: 129,271
Trainable params: 521
Non-trainable params: 128,750
_________________________________________________________________
Training Accuracy: 0.7647
Testing Accuracy:  0.7647

In [24]:
'''
Since the word embeddings are not additionally trained, it is expected to be lower. 
But let’s now see how this performs if we allow the embedding to be 
trained by using trainable=True:
'''

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train,
                    epochs=250,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 100, 50)           128750    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 50)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
=================================================================
Total params: 129,271
Trainable params: 129,271
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Testing Accuracy:  0.8075

In [26]:
# CNN
embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_6 (Embedding)      (None, 100, 100)          257500    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 11        
=================================================================
Total params: 322,929
Trainable params: 322,929
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Testing Accuracy:  0.7594

In [27]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [28]:
param_grid = dict(num_filters=[32, 64],
                  kernel_size=[5, 7],
                  vocab_size=[5000], 
                  embedding_dim=[50],
                  maxlen=[100])

In [29]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'sentiment-labelled-sentences/output.txt'

# Run grid search for each source (yelp, amazon, imdb)
for source, frame in df.groupby('source'):
    print('Running grid search for data set :', source)
    sentences = df['sentence'].values
    y = df['label'].values

    # Train-test split
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Tokenize words
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)

    # Adding 1 because of reserved 0 index
    vocab_size = len(tokenizer.word_index) + 1

    # Pad sequences with zeros
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # Parameter grid for grid search
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen])
    
    model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, batch_size=10,
                            verbose=False)
    
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=1, n_iter=5)
    
    grid_result = grid.fit(X_train, y_train)

    # Evaluate testing set
    test_accuracy = grid.score(X_test, y_test)

    # Save and evaluate results
    prompt = input(f'finished {source}; write to file and proceed? [y/n]')
    if prompt.lower() not in {'y', 'true', 'yes'}:
        break
    with open(output_file, 'a') as f:
        s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
        output_string = s.format(
            source,
            grid_result.best_score_,
            grid_result.best_params_,
            test_accuracy)
        print(output_string)
        f.write(output_string)


Running grid search for data set : amazon
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 15.8min finished
finished amazon; write to file and proceed? [y/n]y
Running amazon data set
Best Accuracy : 0.8205
{'vocab_size': 4603, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 3, 'embedding_dim': 50}
Test Accuracy : 0.8472


Running grid search for data set : imdb
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 17.9min finished
finished imdb; write to file and proceed? [y/n]y
Running imdb data set
Best Accuracy : 0.8200
{'vocab_size': 4603, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 3, 'embedding_dim': 50}
Test Accuracy : 0.8384


Running grid search for data set : yelp
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 19.8min finished
finished yelp; write to file and proceed? [y/n]y
Running yelp data set
Best Accuracy : 0.8224
{'vocab_size': 4603, 'num_filters': 128, 'maxlen': 100, 'kernel_size': 3, 'embedding_dim': 50}
Test Accuracy : 0.8268